/*** ADDS CORE SUBJECT COURSE RECORDS AND CREATES SUBJECT-SPECIFIC DATASETS ***/

* start from an empty session before loading anything
clear all
* NOTE(review): legacy numeric form of the more setting; presumably meant to
* switch off the screen-pause prompt so the script runs unattended - confirm
set more 1

* NOTE(review): paths.do is presumably where the globals used in this script
* (LOGFILES, masterdata, indata, WORKING) are defined - confirm
do "paths.do"

* close any log still open from an earlier run (capture ignores the error when
* none is open), then start a fresh log for this script
capture log close
log using "$LOGFILES\tracking_4.log", replace


**********************************************
*** Identify math and other core courses *****
**********************************************

** start with codebook for courses
* stack the yearly course codebooks for 2011-2019 into one file, tagging the
* rows of each yearly file with a four-digit year
use "$masterdata\p_service11", clear
gen year=2011
forvalues yy = 12/19 {
	append using "$masterdata\p_service`yy'"
	* rows with year still missing are the ones just appended
	* (assumes the yearly files carry no year variable of their own)
	replace year=20`yy' if year==.
}
label variable service "course numeric code"
label variable servicex "course description"

** identify math, generic self-cont (i.e. grd 1-6), spec ed & vocational courses
* subjarea: 01=self-contained, 03=math, 11=vocational, 12=spec ed
* subject: 10=math, 98=generic

* diagnostics: code frequencies by year, then cross-checks between the
* subjarea and subject coding schemes
foreach cc in subjarea subject {
	tab `cc' year, m
}
tab subject if subjarea=="03", m
tab subjarea if subject=="10", m
tab servicex if subjarea=="01" & subject=="98", m
foreach aa in 12 11 {
	tab subject if subjarea=="`aa'", m
}

* 0/1 course-type indicators; spec ed and vocational exclude math courses
gen math=(subject=="10")
gen generic=(subjarea=="01" & subject=="98")
gen sped=(subjarea=="12" & math==0)
gen voced=(subjarea=="11" & math==0)
foreach tt in math generic sped voced {
	tab servicex year if `tt'==1, m
}

* tmp1 flags courses in at least one of the four categories; list the rest
gen tmp1=!(math==0 & generic==0 & sped==0 & voced==0)
foreach cc in servicex service {
	tab `cc' if tmp1==0, m
}

** identify english, science, social studies
* subjarea codes used here: 02 (ela), 04 (sci), 05 (soc)
gen ela = subjarea=="02"
gen sci = subjarea=="04"
gen soc = subjarea=="05"

** keep only core subjects and generic self-cont courses
* a course stays if any of the five indicators is set
local coreflags math ela sci soc generic
drop if !(math | ela | sci | soc | generic)
keep service year servicex `coreflags'
order service year servicex `coreflags'
label variable math "math course"
label variable ela "english language arts course"
label variable sci "science course"
label variable soc "social studies course"
label variable generic "generic course"
* course-by-year codebook used by the subject-specific merges
save "$WORKING\temp\tracking_4-service.dta", replace


*******************************************
*** Create subject-specific merged datasets
*******************************************

** grab core subject courses and merge to base dataset separately by subject
* for each subject (math, ela, sci, soc) this loop:
*   1. saves a subject-specific codebook holding that subject plus generic courses
*   2. merges it, one year at a time (2011-2019), onto the course-completion
*      records, keeps fall-start courses, and stacks the years in tracking_4_A.dta
*   3. merges the stack onto tracking_3, prints course-taking diagnostics, and
*      saves tracking_4-SUBJECT.dta with campus x grade x year averages attached
* note: uses foreach and egen rowtotal() in place of the out-of-date for prefix
*       and egen rsum(); the data produced are the same

foreach ss in math ela sci soc {

	use "$WORKING\temp\tracking_4-service.dta", clear
	keep if (`ss' | generic)
	save "$WORKING\temp\tracking_4-serv-`ss'.dta", replace
	
	* loop by year to address memory constraints
	foreach num of numlist 11(1)19 {

		use "$indata\p_course_complete`num'", clear
		gen year = 20`num'
		rename *, lower
		merge m:1 service year using "$WORKING\temp\tracking_4-serv-`ss'.dta"
		assert _merge<=3
		* the codebook covers all years, so using-only rows from other years
		* come in with the merge; remove them before inspecting _merge==2
		drop if year != 20`num'
		* see what subject/generic courses we are dropping because they do not have a match
		tab servicex if _merge == 2
		drop if _merge == 2
		keep _merge id1 year campus `ss' generic service servicex class_id course_seq student_*_date credit credit_reason 
		assert id1!="" & service!=""

		** rename course-related variables to have shared prefix
		* every course-level variable gets an S prefix
		foreach vv of varlist `ss' generic service servicex class_id course_seq student*_date credit* {
			rename `vv' S`vv'
		}

		** keep only courses starting in the "fall" (July-Sep)
		* note: some year-round campuses start at the end of July
		gen month=month(Sstudent_begin_date)
		* also fails on a missing begin date, since missing is not <= 12
		assert month>=1 & month<=12
		tab month, m
		gen bg_fall=(month==7|month==8|month==9)
		summ bg_fall
		* share of students with any fall courses
		egen tmp1=max(bg_fall), by(id1 campus)
		bys id1 campus: gen tmp2=_n
		tab tmp1 if tmp2==1
		keep if bg_fall==1
		drop month bg_fall tmp*

		** recode variables to 0 or missing for courses not identified 
		* so will keep only one place-holder observation when drop duplicates
		* (_merge==1 marks course records with no match in the subject codebook)
		foreach vv of varlist S`ss' Sgeneric {
			replace `vv' = 0 if (_merge == 1)
		}
		foreach vv of varlist Sservice* Sclass_id Scourse_seq Scredit* {
			replace `vv' = "" if (_merge == 1)
		}
		foreach vv of varlist Sstud*_date {
			replace `vv' = . if (_merge == 1)
		}

		** drop duplicates
		drop _merge
		duplicates drop
		
		** keep at least one observation per student per campus
		egen tmp1=max(S`ss'), by(id1 campus)
		egen tmp2=max(Sgeneric), by(id1 campus)
		* drop other courses if has one of the identified courses
		drop if (S`ss' == 0) & (Sgeneric == 0) & ((tmp1 == 1) | (tmp2 == 1))
		drop tmp*
		
		* stack this year on top of the years already processed for this subject;
		* the first year (11) starts the file afresh
		if `num'>11{
			append using "$WORKING\temp\tracking_4_A.dta"
		}
		qui compress
		save "$WORKING\temp\tracking_4_A.dta", replace

	}

	** verify that overall course taking looks similar over time
	* after the last year, memory holds all years stacked
	gen Soth = ((S`ss' + Sgeneric) == 0)
	la var Soth "non-core subject, non-generic course"
	tabstat S`ss' Sgeneric Soth, by(year)

	** merge to base dataset
	* first pass brings in only rctype and grade, for the diagnostics below
	* with the update option _merge can also be 4 or 5, so the assert checks
	* that no shared variable was updated or in conflict
	merge m:1 id1 campus year using "$WORKING\tracking_3", update keepusing(rctype grade)
	assert (_merge <= 3)
	drop if (_merge == 1)
	* students without matching fall courses may have had spring matches
	* so, could observe rctype 1, 5 and 6 with _merge==2
	tab rctype _merge, m
	gen rctypeflag=(_merge==2)
	 la var rctypeflag "no fall transcript courses, student x campus"
	drop _merge

	** course-taking statistics
	* create flag for each unique student x campus
	assert campus != ""
	bysort id1 campus year: gen tmp1=_n
	tab tmp1, m
	* identify students taking different combinations of courses
	* tmp_m = subject only, tmp_g = generic only, tmp_o = other only,
	* tmp_mg = subject and generic; all are missing for rows without course data
	egen tmp2=max(S`ss'), by (id1 campus year)
	egen tmp3=max(Sgeneric), by (id1 campus year)
	egen tmp4=max(Soth), by(id1 campus year)
	gen tmp_m=(tmp2==1&tmp3==0&tmp4==0) if S`ss'!=.
	gen tmp_g=(tmp2==0&tmp3==1&tmp4==0) if S`ss'!=.
	gen tmp_o=(tmp2==0&tmp3==0&tmp4==1) if S`ss'!=.
	gen tmp_mg=(tmp2==1&tmp3==1&tmp4==0) if S`ss'!=.
	* number of the four categories each row falls in
	egen tmpchk=rowtotal(tmp_*) if S`ss'!=.
	tab tmpchk
	summ tmp_* if tmp1==1
	tabstat tmp_* if tmp1==1, by(grade)
	
	** drop generic course if has core subject course
	drop if Sgeneric==1 & tmp2==1
	drop tmp*
	
	** merge to base dataset
	* second pass brings in all remaining tracking_3 variables
	merge m:1 id1 campus year using "$WORKING\tracking_3", update
	assert (_merge <= 3)
	drop if (_merge == 1)
	drop _merge
	
	save "$WORKING\tracking_4-`ss'", replace

	** add campus-grade level averages
	assert campus!=""
	* one row per student x campus x grade x year before averaging; which row
	* survives should not matter because, after the drops above, the S flags
	* are expected to be constant within a student x campus x year
	duplicates drop id1 campus grade year, force
	collapse (mean) S`ss' Sgeneric Soth, by(campus grade year)
	foreach vv in `ss' generic oth {
		la var S`vv' "`vv' avg campus * grade"
		rename S`vv' CGc`vv'
	}
	* attach the averages back onto the course-level file saved above
	merge 1:m campus grade year using "$WORKING\tracking_4-`ss'.dta", update
	assert _merge==3
	drop _merge

	order id1 id2 invalid_id1_flag state_assig campus grade year D* P* M* LM* ///
	grade_lo grade_hi att* attdays* sixweek* num_att num_trns rctype* ///
	S* campname C* CG*
	descr
	summ
	save "$WORKING\tracking_4-`ss'.dta", replace

}

log close
